DE and DTU Analyses
Load packages
First we will load in the packages needed to execute this analysis
library(rmdformats)
library(dplyr)
library(readr)
library(tidyr)
library(stringr)
library(magrittr)
library(ggplot2)
library(ggrepel)
library(viridis)
library(edgeR)
library(tximport)
library(DRIMSeq)
library(stageR)
library(ggfortify)
library(pheatmap)
library(cowplot)
library(ggbeeswarm)
library(tibble)
library(tictoc)
library(here)Some methods will be called “hisat2”, this just means that bootstrapping estimates have not been taken into account. Nothing super special going on.
I have also defined several functions here for quality of life purposes
# source(here("R/get_dtelist.R"))
# source(here("R/get_filter_plot.R"))
source(here("R/de_testing.R"))
source(here("R/de_plotting.R"))
source(here("R/get_dtu_tximport.R"))
source(here("R/get_stager_res.R"))
source(here("R/draw_venn_STAR.R"))
# source(here("R/get_pca_obj_star.R"))
# source(here("R/get_pca_plot_star.R"))
# source(here("R/plot_blandr.R"))
# source(here("R/cor_funcs.R"))
# source(here("R/plot_blandr.R"))
# source(here("R/theme_justin.R"))
# From 01_Annotations
gene_txp_anno <- read_csv(here("data/grch38_103_df.csv.gz"))
txp_gene_ensdb_lengths <- read_csv(
here("data/txp_gene_ensdb_lengths.csv.gz")
)
transcript_key <- dplyr::select(txp_gene_ensdb_lengths,
c("transcript_id" = tx_id,
"transcript_name" = tx_name))
# From 02_DTElist
hisat2_dtelist <- read_rds(here("data/hisat2_dtelist.rds"))
kallisto_dtelist <- read_rds(here("data/kallisto_dtelist.rds"))
salmon_dtelist <- read_rds(here("data/sa_dtelist.rds"))
star_dtelist <- read_rds(here("data/star_dtelist.rds"))R Markdown
DTE and DTU Analysis
The question we are asking in this study is, “does the choice and implenetation of an alignment-quantification algorithm inherently bias the RNA-seq counts and downstream statistical tests of differential expression and usage?” From what others have already observed in numerous benchmarking studies is that different methods perform differently, use different algorithms, and are even tailored to answering different biological questions about the data. However, the impacts have been infrequently tested for gene expression, and not even considered for transcript expression. My hypothesis is, “the usage of different alignment-quantification methods will not produce any major difference in gene or transcript expression and downstream statistical analysis.” To tackle this hypothesis, I will conduct differential gene expression (DGE), differential transcript expression (DTE), and differential transcript usage (DTU). I will overlap the results to see how they compare at different perspectives. If there are any observable differences then I will interrogate the differences by comparing the relation of the variable genes with any biological differences to see if there is an overrepresentation of any biological bias. The four methods chosen are Kallisto, Salmon, Selective Alignment-Salmon, and Hisat2-Salmon
The code below details the differential expression and usage in each method ## Hisat2
# hisat2 ----
hisat2_lmfit <- hisat2_dtelist %>%
de_lmfit() %T>%
write_csv(here("data/hisat2_lmfit.csv"))
hisat2_lmfit_sig <- hisat2_lmfit %>%
dplyr::filter(DE == TRUE)
# Takes 25 minutes on my Macbook
# Takes 6 minutes on my Lenovo
tic("DTU")
message("Starting DTU now")
hisat2_dtu <- read_csv(here("data/dtu_cts_hisat2_salmon.csv.gz")) %>%
get_dtu_tximport(annotation = gene_txp_anno) %>%
get_stager_res() %T>%
write_csv(here("data/hisat2_DTU.csv"))
message("DTU run finishing now")
toc()## DTU: 391.261 sec elapsed
# DTU: 1319.4 seconds elapsed on Macbook
# DTU: 361.75 seconds elapsed on Lenovo
hisat2_dtu_sig <- hisat2_dtu %>%
dplyr::filter(DTU == TRUE)
hisat2_dtu_sig_txp <- hisat2_dtu %>%
dplyr::filter(txpDTU == TRUE)label_data <- hisat2_lmfit %>%
dplyr::filter(FDR < 0.05 & abs(logFC) > 1.5) %>%
mutate(group = case_when(
FDR < 0.05 & logFC > 1 ~ "Up Regulated",
FDR < 0.05 & logFC < -1 ~ "Down Regulated",
.default = "Not Significant"
))
hisat2_ma <- hisat2_lmfit %>%
plot_ma() +
geom_label_repel(data = label_data,
aes(x = AveExpr, y = logFC, label = transcript_name),
size = 3,
max.overlaps = Inf,
force_pull = 0.5,
force = 3) +
theme(legend.position = "none")
hisat2_volc <- hisat2_lmfit %>%
plot_volc() +
geom_label_repel(data = label_data,
aes(x = logFC, y = -log10(FDR), label = transcript_name),
size = 3,
max.overlaps = Inf,
force_pull = 0.5,
force = 3) +
theme(legend.position = "none")
hisat2_legend <- cowplot::get_legend(plot_ma(hisat2_lmfit))
hisat2_de <- cowplot::plot_grid(hisat2_volc, hisat2_ma, hisat2_legend,
rel_widths = c(1, 1, 0.3),
nrow = 1)
hisat2_dehisat2_de %>%
ggsave(filename = "hisat2_ma_volc_plot.png",
path = here("figures/"),
device = "png",
height = 140,
width = 330,
units = "mm",
dpi = 400)Salmon
salmon_lmfit <- salmon_dtelist %>%
de_lmfit() %T>%
write_csv(here("data/sa_lmfit.csv"))
salmon_lmfit_sig <- salmon_lmfit %>%
dplyr::filter(DE == TRUE)
salmon_dtu <- read_csv(
here("data/dtu_cts_sa.csv.gz")
) %>%
get_dtu_tximport(annotation = gene_txp_anno) %>%
get_stager_res() %T>%
write_csv(here("data/salmon_DTU.csv"))
salmon_dtu_sig <- salmon_dtu %>%
dplyr::filter(DTU == TRUE)
salmon_dtu_sig_txp <- salmon_dtu %>%
dplyr::filter(txpDTU == TRUE)label_data <- salmon_lmfit %>%
dplyr::filter(FDR < 0.05 & abs(logFC) > 6) %>%
mutate(group = case_when(
FDR < 0.05 & logFC > 1 ~ "Up Regulated",
FDR < 0.05 & logFC < -1 ~ "Down Regulated",
.default = "Not Significant"
)) %>%
dplyr::filter(!str_detect(seqnames, "CHR"))
salmon_ma <- salmon_lmfit %>%
plot_ma() +
geom_label_repel(data = label_data,
aes(x = AveExpr, y = logFC, label = transcript_name),
size = 3,
max.overlaps = Inf,
force_pull = 0.5,
force = 3) +
theme(legend.position = "none")
salmon_volc <- salmon_lmfit %>%
plot_volc() +
geom_label_repel(data = label_data,
aes(x = logFC, y = -log10(FDR), label = transcript_name),
size = 3,
max.overlaps = Inf,
force_pull = 0.5,
force = 3) +
theme(legend.position = "none")
salmon_legend <- cowplot::get_legend(plot_ma(salmon_lmfit))
salmon_de <- cowplot::plot_grid(salmon_volc, salmon_ma, salmon_legend,
rel_widths = c(1, 1, 0.3),
nrow = 1)
salmon_deSTAR
star_lmfit <- star_dtelist %>%
de_lmfit() %T>%
write_csv(here("data/star_lmfit.csv"))
star_lmfit_sig <- star_lmfit %>%
dplyr::filter(DE == TRUE)
star_dtu <- read_csv(
here("data/dtu_cts_star_salmon.csv.gz")
) %>%
get_dtu_tximport(annotation = gene_txp_anno) %>%
get_stager_res() %T>%
write_csv(here("data/star_DTU.csv"))
star_dtu_sig <- star_dtu %>%
dplyr::filter(DTU == TRUE)
star_dtu_sig_txp <- star_dtu %>%
dplyr::filter(txpDTU == TRUE)label_data <- star_lmfit %>%
dplyr::filter(FDR < 0.05 & abs(logFC) > 6) %>%
mutate(group = case_when(
FDR < 0.05 & logFC > 1 ~ "Up Regulated",
FDR < 0.05 & logFC < -1 ~ "Down Regulated",
.default = "Not Significant"
)) %>%
dplyr::filter(!str_detect(seqnames, "CHR"))
star_ma <- star_lmfit %>%
plot_ma() +
geom_label_repel(data = label_data,
aes(x = AveExpr, y = logFC, label = transcript_name),
size = 3,
max.overlaps = Inf,
force_pull = 0.5,
force = 3) +
theme(legend.position = "none")
star_volc <- star_lmfit %>%
plot_volc() +
geom_label_repel(data = label_data,
aes(x = logFC, y = -log10(FDR), label = transcript_name),
size = 3,
max.overlaps = Inf,
force_pull = 0.5,
force = 3) +
theme(legend.position = "none")
star_legend <- cowplot::get_legend(plot_ma(star_lmfit))
star_de <- cowplot::plot_grid(star_volc, star_ma, star_legend,
rel_widths = c(1, 1, 0.3),
nrow = 1)
star_deKallisto
kallisto_lmfit <- kallisto_dtelist %>%
de_lmfit() %T>%
write_csv(here("data/kallisto_lmfit.csv"))
kallisto_lmfit_sig <- kallisto_lmfit %>%
dplyr::filter(DE == TRUE)
kallisto_dtu <- read_csv(
here("data/dtu_cts_kallisto.csv.gz")
) %>%
get_dtu_tximport(annotation = gene_txp_anno) %>%
get_stager_res() %T>%
write_csv(here("data/kallisto_DTU.csv"))
kallisto_dtu_sig <- kallisto_dtu %>%
dplyr::filter(DTU == TRUE)
kallisto_dtu_sig_txp <- kallisto_dtu %>%
dplyr::filter(txpDTU == TRUE)label_data <- kallisto_lmfit %>%
dplyr::filter(FDR < 0.05 & abs(logFC) > 4) %>%
mutate(group = case_when(
FDR < 0.05 & logFC > 1 ~ "Up Regulated",
FDR < 0.05 & logFC < -1 ~ "Down Regulated",
.default = "Not Significant"
))
kallisto_ma <- kallisto_lmfit %>%
plot_ma() +
geom_label_repel(data = label_data,
aes(x = AveExpr, y = logFC, label = transcript_name),
size = 3,
max.overlaps = Inf,
force_pull = 0.5,
force = 3) +
theme(legend.position = "none")
kallisto_volc <- kallisto_lmfit %>%
plot_volc() +
geom_label_repel(data = label_data,
aes(x = logFC, y = -log10(FDR), label = transcript_name),
size = 3,
max.overlaps = Inf,
force_pull = 0.5,
force = 3) +
theme(legend.position = "none")
kallisto_legend <- cowplot::get_legend(plot_ma(kallisto_lmfit))
kallisto_de <- cowplot::plot_grid(kallisto_volc, kallisto_ma, kallisto_legend,
rel_widths = c(1, 1, 0.3),
nrow = 1)
kallisto_deVenn Diagram (Figure 4)
venn_dte <- draw_venn(hisat2_res = hisat2_lmfit_sig,
kallisto_res = kallisto_lmfit_sig,
star_res = star_lmfit_sig,
salmon_res = salmon_lmfit_sig) %>%
as_grob()
ggsave(venn_dte,
filename = "figure3_venn_dte.png",
path = here("figures/"),
device = "png",
height = 10,
width = 13,
units = "cm")
hisat2_dtu_sig_gene <- hisat2_dtu %>%
dplyr::filter(DTU == TRUE) %>%
dplyr::select(Gene, geneFDR, DTU) %>%
dplyr::rename("Transcript" = Gene) %>%
distinct()
kallisto_dtu_sig_gene <- kallisto_dtu %>%
dplyr::filter(DTU == TRUE) %>%
dplyr::select(Gene, geneFDR, DTU) %>%
dplyr::rename("Transcript" = Gene) %>%
distinct()
star_dtu_sig_gene <- star_dtu %>%
dplyr::filter(DTU == TRUE) %>%
dplyr::select(Gene, geneFDR, DTU) %>%
dplyr::rename("Transcript" = Gene) %>%
distinct()
salmon_dtu_sig_gene <- salmon_dtu %>%
dplyr::filter(DTU == TRUE) %>%
dplyr::select(Gene, geneFDR, DTU) %>%
dplyr::rename("Transcript" = Gene) %>%
distinct()
venn_dtu <- draw_venn(hisat2_res = hisat2_dtu_sig_gene,
kallisto_res = kallisto_dtu_sig_gene,
star_res = star_dtu_sig_gene,
salmon_res = salmon_dtu_sig_gene) %>%
as_grob()## gTree[GRID.gTree.938]
ggsave(venn_dtu,
filename = "figure3_venn_dtu.png",
path = here("figures/"),
device = "png",
height = 10,
width = 13,
units = "cm")sig_dtu_transcripts
hisat2_dtu_sig_transcripts <- hisat2_dtu %>%
dplyr::filter(txpDTU == TRUE) %>%
dplyr::select(Transcript, txpFDR, txpDTU) %>%
distinct()
kallisto_dtu_sig_transcripts <- kallisto_dtu %>%
dplyr::filter(txpDTU == TRUE) %>%
dplyr::select(Transcript, txpFDR, txpDTU) %>%
distinct()
star_dtu_sig_transcripts <- star_dtu %>%
dplyr::filter(txpDTU == TRUE) %>%
dplyr::select(Transcript, txpFDR, txpDTU) %>%
distinct()
salmon_dtu_sig_transcripts <- salmon_dtu %>%
dplyr::filter(txpDTU == TRUE) %>%
dplyr::select(Transcript, txpFDR, txpDTU) %>%
distinct()
venn_dtu_txp <- draw_venn(hisat2_res = hisat2_dtu_sig_transcripts,
kallisto_res = kallisto_dtu_sig_transcripts,
star_res = star_dtu_sig_transcripts,
salmon_res = salmon_dtu_sig_transcripts) %>%
as_grob()## gTree[GRID.gTree.966]
Cowplot
venn_cowplot <- cowplot::plot_grid(
venn_dte,
venn_dtu_txp,
venn_dtu,
ncol = 1,
labels = c(" a. DTE Results",
" b. DTU Transcript-level Results",
" c. DTU Gene-level Results")
)
venn_cowplotVenn diagram for all
We make DE == TRUE for all rows here so that all genes are included in the venn diagram regardless of significance, just filtering
hisat2_venn_all <- hisat2_lmfit %>%
mutate(DE = TRUE)
salmon_venn_all <- salmon_lmfit %>%
mutate(DE = TRUE)
kallisto_venn_all <- kallisto_lmfit %>%
mutate(DE = TRUE)
star_venn_all <- star_lmfit %>%
mutate(DE = TRUE)
venn_dte_nofilter <- draw_venn(hisat2_res = hisat2_venn_all,
kallisto_res = kallisto_venn_all,
star_res = star_venn_all,
salmon_res = salmon_venn_all) %>%
as_grob()
ggsave(venn_dte_nofilter,
filename = "figure_supp_venn_all_dte_filter_pass.png",
path = here("figures/"),
device = "png",
height = 10,
width = 13,
units = "cm")
hisat2_dtu_sig_gene <- hisat2_dtu %>%
dplyr::select(Gene, geneFDR, DTU) %>%
dplyr::rename("Transcript" = Gene) %>%
distinct()
kallisto_dtu_sig_gene <- kallisto_dtu %>%
dplyr::select(Gene, geneFDR, DTU) %>%
dplyr::rename("Transcript" = Gene) %>%
distinct()
star_dtu_sig_gene <- star_dtu %>%
dplyr::select(Gene, geneFDR, DTU) %>%
dplyr::rename("Transcript" = Gene) %>%
distinct()
salmon_dtu_sig_gene <- salmon_dtu %>%
dplyr::select(Gene, geneFDR, DTU) %>%
dplyr::rename("Transcript" = Gene) %>%
distinct()
venn_dtu_nofilter <- draw_venn(hisat2_res = hisat2_dtu_sig_gene,
kallisto_res = kallisto_dtu_sig_gene,
star_res = star_dtu_sig_gene,
salmon_res = salmon_dtu_sig_gene) %>%
as_grob()Compare differences
hisat2_distinct <- hisat2_lmfit %>%
dplyr::select(transcript_id,
transcript_name,
AveExpr,
logFC,
P.Value,
FDR,
DE) %>%
dplyr::mutate(distinct = case_when(
!transcript_id %in% unique(
rbind(salmon_lmfit,
star_lmfit,
kallisto_lmfit)[["transcript_id"]]
) ~ "Distinct",
.default = "Not Distinct"))
hisat2_exclusive <- hisat2_distinct %>%
dplyr::filter(distinct == "Distinct")
hisat2_distinct_plot <- hisat2_distinct %>%
ggplot(aes(x = AveExpr, y = logFC, colour = distinct, shape = DE)) +
geom_point() +
geom_point(data = hisat2_exclusive) +
scale_colour_manual(values = c("navy", "lightblue")) +
labs(x = "Mean Transcript Expression (log2 CPM)",
y = "log2 Fold Change",
colour = "Exclusivity",
shape = "Significant?") +
theme_bw() +
theme(axis.title = element_text(colour = "black", size = 14),
axis.text = element_text(colour = "black", size = 12),
legend.title = element_text(colour = "black", size = 14,
face = "bold"),
legend.text = element_text(colour = "black", size = 12))
hisat2_distinct_plot# Save these plots
ggsave(plot = hisat2_distinct_plot,
filename = "hisat2_distinct_plot.png",
path = here("figures/"),
height = 100,
width = 200,
units = "mm",
device = "png",
dpi = 400)kallisto_distinct <- kallisto_lmfit %>%
dplyr::select(transcript_id,
transcript_name,
AveExpr,
logFC,
P.Value,
FDR,
DE) %>%
dplyr::mutate(distinct = case_when(
!transcript_id %in% unique(
rbind(salmon_lmfit,
star_lmfit,
hisat2_lmfit)[["transcript_id"]]
) ~ "Distinct",
.default = "Not Distinct"))
kallisto_exclusive <- kallisto_distinct %>%
dplyr::filter(distinct == "Distinct")
kallisto_distinct_plot <- kallisto_distinct %>%
ggplot(aes(x = AveExpr, y = logFC, colour = distinct, shape = DE)) +
geom_point() +
geom_point(data = kallisto_exclusive) +
scale_colour_manual(values = c("navy", "lightblue")) +
labs(x = "Mean Transcript Expression (log2 CPM)",
y = "log2 Fold Change",
colour = "Exclusivity",
shape = "Significant?") +
theme_bw() +
theme(axis.title = element_text(colour = "black", size = 14),
axis.text = element_text(colour = "black", size = 12),
legend.title = element_text(colour = "black", size = 14,
face = "bold"),
legend.text = element_text(colour = "black", size = 12))
kallisto_distinct_plot# Save these plots
ggsave(plot = kallisto_distinct_plot,
filename = "kallisto_distinct_plot.png",
path = here("figures/"),
height = 100,
width = 200,
units = "mm",
device = "png",
dpi = 400)star_distinct <- star_lmfit %>%
dplyr::select(transcript_id,
transcript_name,
AveExpr,
logFC,
P.Value,
FDR,
DE) %>%
dplyr::mutate(distinct = case_when(
!transcript_id %in% unique(
rbind(salmon_lmfit,
kallisto_lmfit,
hisat2_lmfit)[["transcript_id"]]
) ~ "Distinct",
.default = "Not Distinct"))
star_exclusive <- star_distinct %>%
dplyr::filter(distinct == "Distinct")
star_distinct_plot <- star_distinct %>%
ggplot(aes(x = AveExpr, y = logFC, colour = distinct, shape = DE)) +
geom_point() +
geom_point(data = star_exclusive) +
scale_colour_manual(values = c("navy", "lightblue")) +
labs(x = "Mean Transcript Expression (log2 CPM)",
y = "log2 Fold Change",
colour = "Exclusivity",
shape = "Significant?") +
theme_bw() +
theme(axis.title = element_text(colour = "black", size = 14),
axis.text = element_text(colour = "black", size = 12),
legend.title = element_text(colour = "black", size = 14,
face = "bold"),
legend.text = element_text(colour = "black", size = 12))
star_distinct_plot# Save these plots
ggsave(plot = star_distinct_plot,
filename = "star_distinct_plot.png",
path = here("figures/"),
height = 100,
width = 200,
units = "mm",
device = "png",
dpi = 400)salmon_distinct <- salmon_lmfit %>%
dplyr::select(transcript_id,
transcript_name,
AveExpr,
logFC,
P.Value,
FDR,
DE) %>%
dplyr::mutate(distinct = case_when(
!transcript_id %in% unique(
rbind(star_lmfit,
kallisto_lmfit,
hisat2_lmfit)[["transcript_id"]]
) ~ "Distinct",
.default = "Not Distinct"))
salmon_exclusive <- salmon_distinct %>%
dplyr::filter(distinct == "Distinct")
salmon_distinct_plot <- salmon_distinct %>%
ggplot(aes(x = AveExpr, y = logFC, colour = distinct, shape = DE)) +
geom_point() +
geom_point(data = salmon_exclusive) +
scale_colour_manual(values = c("navy", "lightblue")) +
labs(x = "Mean Transcript Expression (log2 CPM)",
y = "log2 Fold Change",
colour = "Exclusivity",
shape = "Significant?") +
theme_bw() +
theme(axis.title = element_text(colour = "black", size = 14),
axis.text = element_text(colour = "black", size = 12),
legend.title = element_text(colour = "black", size = 14,
face = "bold"),
legend.text = element_text(colour = "black", size = 12))
salmon_distinct_plot# Save these plots
ggsave(plot = salmon_distinct_plot,
filename = "salmon_distinct_plot.png",
path = here("figures/"),
height = 100,
width = 200,
units = "mm",
device = "png",
dpi = 400)tx_in_all_de <- hisat2_lmfit_sig %>%
inner_join(salmon_lmfit,
by = "transcript_id") %>%
inner_join(star_lmfit,
by = "transcript_id") %>%
inner_join(kallisto_lmfit,
by = "transcript_id")
# Get the hisat2 transcripts
hisat2_dte_exclusive <- hisat2_lmfit_sig %>%
dplyr::filter(DE == TRUE) %>%
dplyr::filter(!transcript_id %in% salmon_lmfit_sig$transcript_id) %>%
dplyr::filter(!transcript_id %in% star_lmfit_sig$transcript_id) %>%
dplyr::filter(!transcript_id %in% kallisto_lmfit_sig$transcript_id)
hisat2_dte_exclusive %>%
write_csv(here("data/hisat2_exclusive_tx.csv"))
# Get salmon transcripts
salmon_dte_exclusive <- salmon_lmfit_sig %>%
dplyr::filter(DE == TRUE) %>%
dplyr::filter(!transcript_id %in% hisat2_lmfit_sig$transcript_id) %>%
dplyr::filter(!transcript_id %in% star_lmfit_sig$transcript_id) %>%
dplyr::filter(!transcript_id %in% kallisto_lmfit_sig$transcript_id)
# Group to be used for kmer analysis
salmon_dte_exclusive %>%
head(28)## # A tibble: 28 × 24
## gene_id gene_length transcript_length seqnames start end width strand
## <chr> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <chr>
## 1 ENSG00000… 5944 4914 10 3.18e7 3.19e7 123378 -
## 2 ENSG00000… 15174 11501 11 1.97e7 2.01e7 408765 +
## 3 ENSG00000… 4476 4303 9 1.21e8 1.21e8 26781 -
## 4 ENSG00000… 6448 2427 1 1.51e8 1.51e8 64649 -
## 5 ENSG00000… 16474 9588 4 1.05e8 1.05e8 133018 +
## 6 ENSG00000… 14716 3092 12 6.45e7 6.45e7 49957 +
## 7 ENSG00000… 11558 4745 3 1.91e8 1.91e8 137430 +
## 8 ENSG00000… 4912 2489 19 7.28e5 7.48e5 20544 +
## 9 ENSG00000… 4341 3202 CHR_HSC… 3.22e7 3.22e7 5443 -
## 10 ENSG00000… 4849 1884 11 1.23e8 1.23e8 2889 -
## # ℹ 18 more rows
## # ℹ 16 more variables: tx_biotype <chr>, tx_cds_seq_start <dbl>,
## # tx_cds_seq_end <dbl>, tx_support_level <lgl>, tx_id_version <chr>,
## # gc_content <dbl>, tx_name <chr>, transcript_id <chr>,
## # transcript_name <chr>, logFC <dbl>, AveExpr <dbl>, t <dbl>, P.Value <dbl>,
## # FDR <dbl>, B <dbl>, DE <lgl>
salmon_dte_exclusive %>%
write_csv(here("data/salmon_exclusive_tx.csv"))
# Get SA transcripts
star_dte_exclusive <- star_lmfit_sig %>%
dplyr::filter(DE == TRUE) %>%
dplyr::filter(!transcript_id %in% hisat2_lmfit_sig$transcript_id) %>%
dplyr::filter(!transcript_id %in% salmon_lmfit_sig$transcript_id) %>%
dplyr::filter(!transcript_id %in% kallisto_lmfit_sig$transcript_id)
# Group to be used for kmer analysis
star_dte_exclusive %>%
head(28)## # A tibble: 28 × 24
## gene_id gene_length transcript_length seqnames start end width strand
## <chr> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <chr>
## 1 ENSG00000… 7926 449 12 5.35e7 5.35e7 1317 +
## 2 ENSG00000… 6710 1847 12 1.04e8 1.04e8 38633 +
## 3 ENSG00000… 7173 5175 12 1.21e8 1.21e8 60361 -
## 4 ENSG00000… 7392 5884 22 2.96e7 2.97e7 94978 +
## 5 ENSG00000… 14561 3780 10 3.29e7 3.30e7 57842 -
## 6 ENSG00000… 6821 1209 17 6.85e7 6.86e7 35920 +
## 7 ENSG00000… 7518 1009 11 1.02e8 1.02e8 19578 +
## 8 ENSG00000… 3785 1621 3 5.27e7 5.27e7 9128 -
## 9 ENSG00000… 2786 1691 22 3.71e7 3.71e7 11652 +
## 10 ENSG00000… 13755 4674 4 3.32e6 3.44e6 123844 +
## # ℹ 18 more rows
## # ℹ 16 more variables: tx_biotype <chr>, tx_cds_seq_start <dbl>,
## # tx_cds_seq_end <dbl>, tx_support_level <lgl>, tx_id_version <chr>,
## # gc_content <dbl>, tx_name <chr>, transcript_id <chr>,
## # transcript_name <chr>, logFC <dbl>, AveExpr <dbl>, t <dbl>, P.Value <dbl>,
## # FDR <dbl>, B <dbl>, DE <lgl>
star_dte_exclusive %>%
write_csv(here("data/star_exclusive_tx.csv"))
# Get Kallisto transcripts
kallisto_dte_exclusive <- kallisto_lmfit_sig %>%
dplyr::filter(DE == TRUE) %>%
dplyr::filter(!transcript_id %in% salmon_lmfit_sig$transcript_id) %>%
dplyr::filter(!transcript_id %in% star_lmfit_sig$transcript_id) %>%
dplyr::filter(!transcript_id %in% hisat2_lmfit_sig$transcript_id)
# Group to be used for kmer analysis
kallisto_dte_exclusive %>%
head(28)## # A tibble: 28 × 24
## gene_id gene_length transcript_length seqnames start end width strand
## <chr> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <chr>
## 1 ENSG00000… 6366 1336 7 1.05e8 1.05e8 24949 -
## 2 ENSG00000… 10605 8133 17 1.57e6 1.63e6 59567 -
## 3 ENSG00000… 7645 6007 6 1.11e8 1.11e8 205232 -
## 4 ENSG00000… 6644 4697 2 5.50e7 5.51e7 78263 -
## 5 ENSG00000… 1401 561 16 1.64e7 1.64e7 2975 +
## 6 ENSG00000… 7650 7230 CHR_HSC… 2.99e7 3.00e7 122345 -
## 7 ENSG00000… 5294 2704 1 1.59e8 1.59e8 45234 +
## 8 ENSG00000… 9981 1530 15 6.71e7 6.72e7 52894 +
## 9 ENSG00000… 29523 2822 20 4.73e5 5.44e5 71225 -
## 10 ENSG00000… 5349 3293 22 5.03e7 5.04e7 101044 +
## # ℹ 18 more rows
## # ℹ 16 more variables: tx_biotype <chr>, tx_cds_seq_start <dbl>,
## # tx_cds_seq_end <dbl>, tx_support_level <lgl>, tx_id_version <chr>,
## # gc_content <dbl>, tx_name <chr>, transcript_id <chr>,
## # transcript_name <chr>, logFC <dbl>, AveExpr <dbl>, t <dbl>, P.Value <dbl>,
## # FDR <dbl>, B <dbl>, DE <lgl>
kallisto_dte_exclusive %>%
write_csv(here("data/kallisto_exclusive_tx.csv"))
# Get transcripts found in all methods
all_methods_dte <- hisat2_lmfit_sig %>%
dplyr::filter(transcript_id %in% salmon_lmfit_sig$transcript_id) %>%
dplyr::filter(transcript_id %in% star_lmfit_sig$transcript_id) %>%
dplyr::filter(transcript_id %in% kallisto_lmfit_sig$transcript_id) %>%
dplyr::filter(DE == TRUE)
all_methods_dte %>%
write_csv(here("data/all_methods_tx.csv"))tx_in_all_dtu <- hisat2_dtu_sig %>%
inner_join(salmon_dtu_sig,
by = "Transcript") %>%
inner_join(star_dtu_sig,
by = "Transcript") %>%
inner_join(kallisto_dtu_sig,
by = "Transcript")
hisat2_dtu_exclusive <- hisat2_dtu_sig %>%
dplyr::filter(!Transcript %in% salmon_dtu_sig$Transcript) %>%
dplyr::filter(!Transcript %in% star_dtu_sig$Transcript) %>%
dplyr::filter(!Transcript %in% kallisto_dtu_sig$Transcript)
hisat2_dtu_exclusive %>%
write_csv(here("data/hisat2_exclusive_dtu.csv"))
salmon_dtu_exclusive <- salmon_dtu_sig %>%
dplyr::filter(!Transcript %in% hisat2_dtu_sig$Transcript) %>%
dplyr::filter(!Transcript %in% star_dtu_sig$Transcript) %>%
dplyr::filter(!Transcript %in% kallisto_dtu_sig$Transcript)
salmon_dtu_exclusive %>%
write_csv(here("data/salmon_exclusive_dtu.csv"))
star_dtu_exclusive <- star_dtu_sig %>%
dplyr::filter(!Transcript %in% hisat2_dtu_sig$Transcript) %>%
dplyr::filter(!Transcript %in% salmon_dtu_sig$Transcript) %>%
dplyr::filter(!Transcript %in% kallisto_dtu_sig$Transcript)
star_dtu_exclusive %>%
write_csv(here("data/star_exclusive_dtu.csv"))
kallisto_dtu_exclusive <- kallisto_dtu_sig %>%
dplyr::filter(!Transcript %in% salmon_dtu_sig$Transcript) %>%
dplyr::filter(!Transcript %in% star_dtu_sig$Transcript) %>%
dplyr::filter(!Transcript %in% hisat2_dtu_sig$Transcript)
kallisto_dtu_exclusive %>%
write_csv(here("data/kallisto_exclusive_dtu.csv"))
all_methods_dtu <- hisat2_dtu_sig %>%
dplyr::filter(Transcript %in% salmon_dtu_sig$Transcript) %>%
dplyr::filter(Transcript %in% star_dtu_sig$Transcript) %>%
dplyr::filter(Transcript %in% kallisto_dtu_sig$Transcript)
all_methods_dtu %>%
write_csv(here("data/all_methods_tx.csv"))Investigate PCBP2
pcbp2_comp_df <- full_join(
hisat2_lmfit %>%
dplyr::select(transcript_name,
"logFC_hisat2" = logFC,
"FDR_hisat2" = FDR) %>%
dplyr::filter(str_detect(transcript_name, "PCBP2")),
kallisto_lmfit %>%
dplyr::select(transcript_name,
"logFC_kallisto" = logFC,
"FDR_kallisto" = FDR) %>%
dplyr::filter(str_detect(transcript_name, "PCBP2")),
by = "transcript_name") %>%
full_join(salmon_lmfit %>%
dplyr::select(transcript_name,
"logFC_salmon" = logFC,
"FDR_salmon" = FDR) %>%
dplyr::filter(str_detect(transcript_name, "PCBP2")),
by = "transcript_name") %>%
full_join(star_lmfit %>%
dplyr::select(transcript_name,
"logFC_STAR" = logFC,
"FDR_STAR" = FDR) %>%
dplyr::filter(str_detect(transcript_name, "PCBP2")),
by = "transcript_name")
logfc_pcbp2 <- pcbp2_comp_df %>%
pivot_longer(cols = starts_with("logFC"),
values_to = "logFC",
names_to = "group") %>%
ggplot(aes(x = group, y = logFC, colour = group)) +
geom_point(size = 3) +
scale_colour_manual(values = c("#61c3d7", "#4fc14d",
"#bf4dc1", "#f6866f")) +
facet_wrap(~ transcript_name,
nrow = 5, ncol = 5) +
theme_bw() +
theme(axis.text.x = element_text(size = 12, colour = "black", angle = 90))
ggsave(plot = logfc_pcbp2,
filename = "logfc_pcbp2.png",
path = here("figures/"),
height = 300,
width = 320,
units = "mm",
device = "png",
dpi = 400)
fdr_pcbp2 <- pcbp2_comp_df %>%
pivot_longer(cols = starts_with("FDR"),
values_to = "FDR",
names_to = "group") %>%
ggplot(aes(x = group, y = -log2(FDR), colour = group)) +
geom_point(size = 3) +
scale_colour_manual(values = c("#61c3d7", "#4fc14d",
"#bf4dc1", "#f6866f")) +
facet_wrap(~ transcript_name,
nrow = 5, ncol = 5) +
theme_bw() +
theme(axis.text.x = element_text(size = 12, colour = "black", angle = 90))
fdr_pcbp2Save the plot